package com.jackingod.crawler;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawler that pulls a news title and body out of a fetched page with
 * regular expressions.
 *
 * <p>The title is expected between {@code <!-- begin_t -->} and
 * {@code <!-- end_t -->} marker comments, and the body between
 * {@code <!-- begin_ct -->} and {@code <!-- end_ct -->}. All patterns are
 * compiled once and shared, since {@link Pattern} instances are immutable
 * and safe to reuse.
 */
public class RegexCrawlerNew extends TemplateCrawler implements Crawlable {

	/**
	 * Matches a whole page containing the title markers followed by the
	 * content markers; group 1 is the title, group 2 the raw content.
	 * The quantifiers are greedy and {@code .} does not match line
	 * terminators (no DOTALL flag).
	 */
	private static final Pattern NEWS_PATTERN = Pattern.compile(
			".*<!--\\sbegin_t\\s-->(.*)<!--\\send_t\\s-->"
			+ ".*<!--\\sbegin_ct\\s-->(.*)<!--\\send_ct\\s-->.*");

	/** An ampersand, 1 to 10 ASCII letters and a semicolon, e.g. {@code &nbsp;}. */
	private static final Pattern HTML_ENTITY = Pattern.compile("\\&[a-zA-Z]{1,10};");

	/** Anything from {@code <} up to the next {@code >}. */
	private static final Pattern HTML_TAG = Pattern.compile("<[^>]*>");

	/**
	 * Any single one of the characters {@code ( / > ) <}.
	 * NOTE(review): this is a character class, so it also strips every
	 * parenthesis and slash from the text -- confirm that is intended.
	 */
	private static final Pattern LEFTOVER_MARKUP_CHARS = Pattern.compile("[(/>)<]");

	/**
	 * Fills {@code newsTitle} and {@code newsContent} from {@code htmlContent}.
	 *
	 * <p>Does nothing when {@code htmlContent} is {@code null}. When the page
	 * does not match the marker layout, both fields are left untouched.
	 */
	@Override
	public void extractNews() {
		if (htmlContent == null) {
			return;
		}

		// Occurrences of NewLine are removed before matching; presumably these
		// are line terminators, which '.' would otherwise refuse to cross --
		// TODO confirm against the definition of NewLine.
		String flattened = htmlContent.replaceAll(NewLine, "");

		// matches() requires the entire flattened page to fit the pattern.
		Matcher m = NEWS_PATTERN.matcher(flattened);
		if (m.matches()) {
			newsTitle = m.group(1);
			newsContent = removeTag(m.group(2));
		}
	}

	/**
	 * Runs one crawl: calls {@code init()}, stores the URL in the superclass,
	 * calls {@code startCrawling()}, then extracts the news fields.
	 *
	 * @param url the address stored in the inherited {@code url} field
	 */
	@Override
	public void start(String url) {
		init();
		super.url = url;
		startCrawling();
		this.extractNews();
	}

	/**
	 * Strips markup from an HTML fragment in three passes: named entities,
	 * then tags, then any remaining {@code ( / > ) <} characters.
	 *
	 * @param html the fragment to clean; must not be {@code null}
	 * @return the fragment with the matched markup removed
	 */
	private String removeTag(String html) {
		String withoutEntities = HTML_ENTITY.matcher(html).replaceAll("");
		String withoutTags = HTML_TAG.matcher(withoutEntities).replaceAll("");
		return LEFTOVER_MARKUP_CHARS.matcher(withoutTags).replaceAll("");
	}
}
